from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
# Sentiment tokenizer and model; the checkpoint name can be changed to test alternative models
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
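Any three-class sentiment checkpoint from the Hugging Face Hub should drop in here; the 0/0.5/1 score mapping used later assumes three labels ordered negative/neutral/positive. A hedged example using the same authors' newer checkpoint (commented out so the defaults above stay active):
# tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
# model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')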
These settings depend on which board you want to scrape, how much of it to scrape, the title for your output CSVs, etc. Examples are filled in.
board_url = the URL of the board you would like to scrape
pages = how many pages of the board you'd like to scrape (check the website for the total number of pages if you'd like to do a full scrape)
project_title = defines the title for the CSVs output by the script
board_url = 'https://forums.studentdoctor.net/forums/ob-gyn.39/'
pages = 60
project_title = 'SDNOBGYN'
Running this will output 6 CSVs:
[title]_raw_data.csv: post timestamp + content
[title]_data_sentiment.csv: post timestamp + content + sentiment analysis
[title]_daily_average.csv: date + daily sentiment average
[title]_weekly_average.csv: date + weekly sentiment average
[title]_monthly_average.csv: month + monthly sentiment average
[title]_yearly_average.csv: year + annual sentiment average
These CSVs may contain a few posts with outlier timestamps, caused by stickied posts at the top of the first page; those rows will need to be cleaned out unless you are running a full scrape of the board from inception to the present day.
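A minimal cleanup sketch for that case, assuming a hypothetical cutoff date before which no genuine posts exist (adjust per board); it is commented out since the raw CSV is only written further down:
# cleaned = pd.read_csv(project_title + '_raw_data.csv', parse_dates=['Date'])
# cutoff = pd.Timestamp('2005-01-01', tz='UTC')  # hypothetical cutoff date; set per board
# cleaned = cleaned[cleaned['Date'] >= cutoff]
# cleaned.to_csv(project_title + '_raw_data_cleaned.csv', index=False)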
base_url = 'https://forums.studentdoctor.net/'
forum_url_template = f"{board_url}page-{{}}"
full_post_urls = []
for page_number in range(1, pages + 1):  # range(1, pages) would skip the final page
    page_url = forum_url_template.format(page_number)  # avoid shadowing board_url
    board_html = requests.get(page_url)
    board_soup = BeautifulSoup(board_html.text, 'html.parser')
    # Each thread title on a board page links to its full thread
    posts = board_soup.find_all('div', attrs={'class': 'structItem-title'})
    for post in posts:
        link = post.find('a')['href']
        full_link = base_url + link
        full_post_urls.append(full_link)
def extract_forum_data(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # Timestamps come from <time> tags; post bodies live in div.bbWrapper
    result_date = [time['datetime'] for time in soup.find_all('time')]
    result_text = [div.get_text(separator=' ', strip=True) for div in soup.find_all('div', attrs={'class': 'bbWrapper'})]
    return list(zip(result_date, result_text))
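Before crawling every thread, it can be worth spot-checking the helper on a single URL (optional, hence commented out):
# print(extract_forum_data(full_post_urls[0])[:2])  # first two (timestamp, text) pairs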
all_data = []
for url in full_post_urls:
    all_data.extend(extract_forum_data(url))
df = pd.DataFrame(all_data, columns=['Date','Content'])
df['Date'] = pd.to_datetime(df['Date'], utc=True)
df.set_index('Date', inplace=True)
df.to_csv(project_title+'_raw_data.csv', index=True)
def sentiment_score(review):
    # Truncate at the tokenizer level so inputs never exceed the model's 512-token limit
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    result = model(tokens)
    # argmax yields 0 (negative), 1 (neutral), or 2 (positive); dividing by 2 maps to 0 / 0.5 / 1
    return int(torch.argmax(result.logits)) / 2
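A quick, hedged sanity check (exact outputs depend on the model weights, but positive text should score near 1 and negative near 0):
# print(sentiment_score('Matching here was the best decision I ever made'))  # expect ~1.0
# print(sentiment_score('This rotation was a miserable experience'))  # expect ~0.0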
df['Sentiment'] = df['Content'].apply(sentiment_score)  # truncation is handled by the tokenizer above
df.to_csv(project_title+'_data_sentiment.csv', index=True)
def resample_data(df, freq):
    # Note: pandas 2.2+ prefers the 'ME'/'YE' aliases over 'M'/'Y' for month/year-end
    return df['Sentiment'].resample(freq).mean()
daily_avg = resample_data(df, 'D')
daily_avg.index = daily_avg.index.strftime('%Y-%m-%d')
weekly_avg = resample_data(df, 'W')
weekly_avg.index = weekly_avg.index.strftime('%Y-%m-%d')
monthly_avg = resample_data(df, 'M')
monthly_avg.index = monthly_avg.index.strftime('%Y-%m')
yearly_avg = resample_data(df, 'Y')
yearly_avg.index = yearly_avg.index.strftime('%Y')
daily_avg.to_csv(project_title+'_daily_average.csv', index=True)
weekly_avg.to_csv(project_title+'_weekly_average.csv', index=True)
monthly_avg.to_csv(project_title+'_monthly_average.csv', index=True)
yearly_avg.to_csv(project_title+'_yearly_average.csv', index=True)
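The resampled averages can then be plotted with the matplotlib import above; a minimal sketch for the monthly series:
monthly_avg.plot(title=project_title + ' monthly sentiment average')
plt.xlabel('Month')
plt.ylabel('Mean sentiment (0 = negative, 1 = positive)')
plt.tight_layout()
plt.show()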